This notebook explores various word-specific statistics together with their visualizations.
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import sys
sys.path.append(os.path.join(os.getcwd(), "src"))
import util.io as mio
import util.plotting as mplot
from model.conversationDataframe import ConversationDataframe
from stats.wordsCountStats import WordsCountStats
from stats.iConvStats import IConvStats
from util import statsUtil
# Plotting setup: select matplotlib's "notebook" backend and seaborn's "paper" context.
%matplotlib notebook
sns.set_context("paper")
# Load the autoreload extension in mode 2, which reloads modules before each cell executes,
# so edits to the project sources under src/ are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
# Path to the conversation file to analyse (relative to the working directory).
# The default points at the sample file under src/resources/unittest, which can be used for testing.
FILEPATH = "src/resources/unittest/test_plotting.txt"
# Build the conversation object, load its messages and preview the first rows.
conv = ConversationDataframe(FILEPATH)
conv.loadMessages()
msgs = conv.messages
msgs.head()
Out[2]:
Consider the overall word usage comparison between senders.
In [3]:
# Generate word-count stats for single words only (ngram_range=(1,1)) and preview three rows.
# NOTE(review): no groupByColumns is passed; the counts appear to be grouped by sender,
# since later cells index them with 's1'/'s2' — confirm against generateStats.
sender_stats = conv.stats.generateStats(IConvStats.STATS_NAME_WORDCOUNT, ngram_range=(1,1))
sender_word_count = sender_stats.wordsCount
sender_word_count.head(3)
Out[3]:
Most frequent words, overall (total) and sender specific.
In [4]:
# Word counts with no sender filter; display the five highest counts.
total_word_count = sender_stats.getWordsCount()
total_word_count.sort_values(ascending=False).head(5)
Out[4]:
In [5]:
# Word counts for sender 's1' only; display the five highest counts.
sender_stats.getWordsCount(sender='s1').sort_values(ascending=False).head(5)
Out[5]:
Words with the highest frequency difference between senders.
In [6]:
# Absolute per-word difference between the 's1' and 's2' rows of the sender word counts;
# display the five largest differences.
unbalances_word_count = sender_word_count.loc['s1'].sub(sender_word_count.loc['s2']).abs()
unbalances_word_count.sort_values(ascending=False).head(5)
Out[6]:
In [29]:
# Preview the first five entries returned by getWordsUsedJustBy for the pair ('s2', 's1').
# NOTE(review): presumably these are words used by one sender and never by the other;
# confirm the argument order against WordsCountStats.getWordsUsedJustBy.
sender_stats.getWordsUsedJustBy('s2', 's1').head(5)
Out[29]:
Consider word usage over time, aggregated by month.
In [30]:
# Generate word-count stats grouped by (year, month) and preview the resulting table.
month_stats = conv.stats.generateStats(IConvStats.STATS_NAME_WORDCOUNT, groupByColumns=['year', 'month'])
month_word_count = month_stats.wordsCount
month_word_count.head()
Out[30]:
In [31]:
# Word counts from the month-grouped stats, restricted to three target words.
# NOTE(review): the same word list is repeated in the plotting cells below; keep them in sync.
tot_month_word_count = month_stats.getWordsCount(['17','your','youth'])
tot_month_word_count.head()
Out[31]:
In [32]:
# Plot word counts for the three target words using the month-grouped stats.
mplot.plotWordsCount(month_stats, ['17','your','youth'])
In [33]:
# Plot the same three target words from the month-grouped stats, passing sender='s1'.
mplot.plotWordsCount(month_stats, ['17','your','youth'], sender='s1')
In [41]:
# Select the target-word columns of the monthly counts and reshape them for plotting.
# NOTE(review): transformStats presumably produces a 'word' label column and a 'val' value
# column (the next cell plots 'val' with row='word') — confirm in util.statsUtil.
target_words = ['your','youth']
stats_to_plot = statsUtil.transformStats(month_word_count[target_words], 'word', 'val')
In [42]:
# Factor plot of 'val' against 'month', faceted with col='year' and row='word'.
# NOTE(review): _genericFactorPlot is a private helper (leading underscore); prefer a public
# plotting function if util.plotting offers one.
mplot._genericFactorPlot(stats_to_plot, 'month', 'val', {}, "Word count", "count", col='year', row='word')
Consider word usage over time, aggregated by hour.
In [37]:
# Generate word-count stats grouped by hour and preview the first three rows.
hour_stats = conv.stats.generateStats(IConvStats.STATS_NAME_WORDCOUNT, groupByColumns=['hour'])
hour_word_count = hour_stats.wordsCount
hour_word_count.head(3)
Out[37]:
In [38]:
# Plot the total counts of the three target words using the hour-grouped stats.
mplot.plotWordsCount(hour_stats, ['17','your','youth'])
In [43]:
# Select the target-word columns of the hourly counts and reshape them for plotting.
# NOTE(review): transformStats presumably produces a 'word' label column and a 'val' value
# column (the next cell plots 'val' with row='word') — confirm in util.statsUtil.
target_words = ['17','your','youth']
stats_to_plot = statsUtil.transformStats(hour_word_count[target_words], 'word', 'val')
In [44]:
# Factor plot of 'val' against 'hour', faceted with row='word'.
# The two text arguments mirror the monthly factor plot so the figure is labelled
# rather than rendered with empty strings.
# NOTE(review): _genericFactorPlot is a private helper (leading underscore); prefer a public
# plotting function if util.plotting offers one.
mplot._genericFactorPlot(stats_to_plot, 'hour', 'val', {}, "Word count", "count", row='word')
Consider how word usage changes over time.
In [46]:
# Generate word-count stats grouped by (year, month).
# NOTE(review): this repeats the generateStats call made in the monthly section above;
# the earlier result could be reused if the recomputation is expensive.
month_stats = conv.stats.generateStats(IConvStats.STATS_NAME_WORDCOUNT, groupByColumns=['year', 'month'])
month_word_count = month_stats.wordsCount
In [47]:
# Compute the word trend from the monthly word counts.
# NOTE(review): _computeWordsTrend is a private method (leading underscore) and its exact
# definition of "trend" is not visible here — confirm in WordsCountStats.
total_month_word_trend = month_stats._computeWordsTrend(month_word_count)
In [48]:
# Preview the first rows of the trend table.
total_month_word_trend.head()
Out[48]:
In [55]:
# Keep only trend values whose magnitude exceeds the threshold (all other cells become NaN),
# then drop the columns and the rows that are left entirely empty.
change_threshold = 20
(
    total_month_word_trend
    .where(total_month_word_trend.abs() > change_threshold)
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='all')
)
Out[55]:
In [56]:
# Per-column standard deviation of the trend table with ddof=0 (the np.std default);
# display the five largest values.
total_month_word_trend.std(axis=0, ddof=0).sort_values(ascending=False).head(5)
Out[56]: